# Load packages (the Excel file itself is read further down)
library(readxl)
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
#> ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
#> ✔ purrr     1.0.2     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
#> 
#> Attaching package: 'janitor'
#> 
#> The following objects are masked from 'package:stats':
#> 
#>     chisq.test, fisher.test
library(httpgd)
library(languageserver)
library(grid)
library(ggplot2)
library(shadowtext)
library(plotly)
#> 
#> Attaching package: 'plotly'
#> 
#> The following object is masked from 'package:ggplot2':
#> 
#>     last_plot
#> 
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> 
#> The following object is masked from 'package:graphics':
#> 
#>     layout
# Start the httpgd graphics server; it reports the local URL it is running at
httpgd::hgd()
#> httpgd server running at:
#>   http://127.0.0.1:62328/live?token=GQtx2NqN

# Build the workbook path explicitly instead of calling setwd(), so running
# this script does not change the session's working directory.
# NOTE: project_dir is machine-specific; adjust it when running elsewhere.
project_dir <- "C:/Users/jason/OneDrive - University of Cambridge/sem-lab-code/practice-code"
data_path <- file.path(
  dirname(project_dir), "data-depo", "bailey-299-driver-genes-only.xlsx"
)

# Read sheet 2. Most cells in its first row are blank, so readxl auto-names
# those columns (`...2`, `...3`, ...); the real header row is promoted below.
raw_tib <- read_excel(data_path, sheet = 2)
#> New names:
#> • `` -> `...2`
#> • `` -> `...3`
#> • `` -> `...4`
#> • `` -> `...5`
#> • `` -> `...6`
#> • `` -> `...7`
#> • `` -> `...8`
#> • `` -> `...9`
#> • `` -> `...10`
#> • `` -> `...11`
#> • `` -> `...12`

# Promote the third row to be the column names (janitor::row_to_names).
# The printed result starts directly at the first gene record.
raw_tib <- row_to_names(raw_tib, row_number = 3)

print(raw_tib)
#> # A tibble: 739 × 12
#>    Gene   Cancer   KEY        Tumor suppressor or …¹ Decision `Tissue Frequency`
#>    <chr>  <chr>    <chr>      <chr>                  <chr>    <chr>             
#>  1 ABL1   PANCAN   ABL1_PANC… <NA>                   rescued  NA                
#>  2 ACVR1  UCEC     ACVR1_UCEC oncogene               official 5.303030303030299…
#>  3 ACVR1B PANCAN   ACVR1B_PA… possible tsg           official NA                
#>  4 ACVR2A COADREAD ACVR2A_CO… tsg                    official 2.848101265822779…
#>  5 ACVR2A LIHC     ACVR2A_LI… possible tsg           official 3.10734463276836E…
#>  6 ACVR2A PANCAN   ACVR2A_PA… possible tsg           official NA                
#>  7 AJUBA  PANCAN   AJUBA_PAN… tsg                    official NA                
#>  8 AJUBA  HNSC     AJUBA_HNSC tsg                    official 6.374501992031869…
#>  9 AKT1   CESC     AKT1_CESC  oncogene               official 2.554744525547450…
#> 10 AKT1   PRAD     AKT1_PRAD  oncogene               official 6.289308176100630…
#> # ℹ 729 more rows
#> # ℹ abbreviated name: ¹​`Tumor suppressor or oncogene prediction (by 20/20+)`
#> # ℹ 6 more variables: `Pancan Frequency` <chr>, `Consensus Score` <chr>,
#> #   `Correlation adusted score` <chr>, Novel <chr>, `Rescue Notes` <chr>,
#> #   `Note about previous publication` <chr>
# row_to_names() above already removed the promoted header row and the rows
# before it, so every remaining row is data (the first one is ABL1 / PANCAN).
# Dropping rows 1-2 here would silently discard two real gene records, so the
# table is carried forward unchanged.
tib <- raw_tib

# Keep only the bladder cancer (BLCA) rows
blca_df <- dplyr::filter(tib, Cancer == "BLCA")

# Order by `Tissue Frequency` (backticks needed because of the space in the
# name). arrange() sorts ascending by default, and the column is still
# character at this point, so this ordering is lexicographic, not numeric.
blca_sorted <- dplyr::arrange(blca_df, `Tissue Frequency`)

print(blca_sorted)
#> # A tibble: 45 × 12
#>    Gene   Cancer KEY         Tumor suppressor or o…¹ Decision `Tissue Frequency`
#>    <chr>  <chr>  <chr>       <chr>                   <chr>    <chr>             
#>  1 FAT1   BLCA   FAT1_BLCA   tsg                     official 0.116580310880829 
#>  2 ERBB3  BLCA   ERBB3_BLCA  oncogene                official 0.119170984455959 
#>  3 CREBBP BLCA   CREBBP_BLCA tsg                     official 0.126943005181346…
#>  4 ERBB2  BLCA   ERBB2_BLCA  oncogene                official 0.129533678756477…
#>  5 SPTAN1 BLCA   SPTAN1_BLCA tsg                     official 0.132124352331605…
#>  6 ATM    BLCA   ATM_BLCA    possible tsg            official 0.134715025906736…
#>  7 EP300  BLCA   EP300_BLCA  tsg                     official 0.137305699481865 
#>  8 ELF3   BLCA   ELF3_BLCA   possible tsg            official 0.139896373056995…
#>  9 FGFR3  BLCA   FGFR3_BLCA  oncogene                official 0.152849740932642 
#> 10 STAG2  BLCA   STAG2_BLCA  tsg                     official 0.155440414507771…
#> # ℹ 35 more rows
#> # ℹ abbreviated name: ¹​`Tumor suppressor or oncogene prediction (by 20/20+)`
#> # ℹ 6 more variables: `Pancan Frequency` <chr>, `Consensus Score` <chr>,
#> #   `Correlation adusted score` <chr>, Novel <chr>, `Rescue Notes` <chr>,
#> #   `Note about previous publication` <chr>
# Both score columns were read as text: convert them to numeric, re-sort by
# tissue frequency (ascending, now numerically), then fix that row order as
# the factor levels of Gene so plots keep it.
# ("adusted" is the spelling used in the sheet's own header.)
blca_sorted <- blca_sorted %>%
  mutate(across(
    c(`Tissue Frequency`, `Correlation adusted score`),
    as.numeric
  )) %>%
  arrange(`Tissue Frequency`) %>%
  mutate(Gene = factor(Gene, levels = Gene))

view(blca_sorted)

# Static horizontal bar chart: one bar per gene, each labelled with its
# tissue frequency formatted to two decimal places
plt_gg <- ggplot(blca_sorted, aes(x = Gene, y = `Tissue Frequency`)) +
  geom_col(fill = "blue", width = 0.6) +
  geom_text(
    aes(label = sprintf("%0.2f", round(`Tissue Frequency`, digits = 2))),
    hjust = -0.1, # nudge the label just past the end of its bar
    size = 2
  ) +
  coord_flip() + # genes on the vertical axis
  theme_minimal() +
  ggtitle("Tissue Frequencies of Driver Genes in Bladder Cancer") +
  theme(
    axis.text.x = element_text(size = 5),
    axis.text.y = element_text(size = 5, margin = margin(r = -20))
  )

plot(plt_gg)



# Round the two numeric columns in place (4 and 3 decimal places), then build
# the HTML hover label from the rounded values. mutate() evaluates its
# arguments in order, so hover_text sees the rounded columns.
blca_sorted <- blca_sorted %>%
  mutate(
    `Tissue Frequency` = round(`Tissue Frequency`, 4),
    `Correlation adusted score` = round(`Correlation adusted score`, 3),
    hover_text = paste(
      "Gene:", Gene,
      "<br>Tissue Frequency:", `Tissue Frequency`,
      "<br>Correlation Adjusted Consensus Score:", `Correlation adusted score`,
      "<br>TSG or Oncogene:",
      `Tumor suppressor or oncogene prediction (by 20/20+)`
    )
  ) # nolint

# Interactive horizontal bar chart: bar length is the tissue frequency, bar
# colour encodes the correlation adjusted consensus score (Viridis scale with
# a colour bar), and hovering shows the prebuilt hover_text.
plt <- plot_ly(
  blca_sorted,
  x = ~`Tissue Frequency`,
  y = ~Gene,
  type = "bar",
  orientation = "h",
  marker = list(
    color = ~`Correlation adusted score`,
    colorscale = "Viridis",
    showscale = TRUE,
    colorbar = list(
      title = list(text = "Correlation Adjusted Consensus Score", side = "right")
    )
  ),
  text = ~hover_text,
  hoverinfo = "text"
)

# Axis titles, one tick per gene, and spacing between bars
plt <- plotly::layout(
  plt,
  xaxis = list(title = "Tissue Frequency"),
  yaxis = list(
    title = "Gene",
    tickmode = "array",
    tickvals = ~Gene,
    ticktext = ~Gene,
    tickfont = list(size = 8)
  ),
  bargap = 0.3
)

# Render the interactive chart
plt

# Render the static ggplot version again
plot(plt_gg)